'''
Scrape the word lists published on iciba and store them in a MySQL table.

Source pages: http://word.iciba.com/?action=words&class=13&course=
'''


import sys
from bs4 import BeautifulSoup # 网页解析 获取数据
import re   # 正则表达式，进行文字匹配
import urllib.request, urllib.error # 指定url，获取网页数据
import xlwt   # 写入excel
import pymysql # 数据库操作



# The word itself: the value of a <span title="..."> attribute.
# Applied to the raw markup, so the space after "span" is significant.
Name = re.compile(r'<span title="(.*?)">')

# Runs of whitespace (spaces, tabs, newlines). getData uses this to strip all
# whitespace from the markup before the three patterns below are applied,
# which is why those patterns contain no spaces (e.g. "<stronglang=").
Phonetic = re.compile(r'\s+')

# Phonetic transcription: the text inside the <strong lang="EN-US" ...> tag.
Phonetic_ = re.compile(r'<stronglang="EN-US"xml:lang="EN-US">(.*?)</strong>')

# Word type: the part of the title attribute before the first period.
Tp = re.compile(r'<spanstyle="display:block;"title="(.*?)\.')

# Chinese translation: the part of the title attribute after the first period.
Info = re.compile(r'<spanstyle="display:block;"title=".*?\.(.*?)">')


# Extract the word data from every course page and store it
def getData(baseurl, pages=275):
    """Scrape each course page and save the words it lists to the database.

    Pages are fetched from ``baseurl + "1"`` up to ``baseurl + str(pages)``.
    For every ``<ul class="word_main_list">`` element found on a page, the
    words, phonetic transcriptions, word types and translations are extracted
    with the module-level patterns and passed to ``saveDataDB``.

    Args:
        baseurl: URL prefix to which the 1-based course number is appended.
        pages: Number of course pages to fetch. Defaults to 275, the value
            that used to be hard-coded.
    """
    for course in range(1, pages + 1):
        html = askURL(baseurl + str(course))
        soup = BeautifulSoup(html, "html.parser")
        for word_list in soup.find_all('ul', class_="word_main_list"):
            markup = str(word_list)
            # The word pattern relies on the space in '<span title=', so it
            # has to run on the untouched markup.
            names = Name.findall(markup)
            # The remaining patterns are written without any whitespace, so
            # strip it all out before matching them.
            compact = Phonetic.sub('', markup)
            phonetics = Phonetic_.findall(compact)
            types = Tp.findall(compact)
            infos = Info.findall(compact)
            # Hand the freshly extracted lists straight to the database layer
            # instead of re-scanning an ever-growing list for its last entry.
            saveDataDB(names, phonetics, types, infos)


# Fetch the content of a single URL
def askURL(url):
    """Download ``url`` and return its body decoded as UTF-8.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: The address to fetch.

    Returns:
        The decoded response body, or an empty string when the request fails
        with ``urllib.error.URLError``; in that case the error's ``code``
        and/or ``reason`` are printed when present.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

# Save the scraped data to MySQL
def saveDataDB(list1, list2, list3, list4):
    """Insert one row per word into the ``Words`` table.

    The four lists are combined position by position; rows are built only up
    to the length of the shortest list, so a page where one field has fewer
    matches no longer aborts the run with an ``IndexError``.

    Args:
        list1: Values for the ``word`` column.
        list2: Values for the ``phonetic`` column.
        list3: Values for the ``type_`` column.
        list4: Values for the ``info`` column.
    """
    # Placeholders are filled in by the driver, so quotes or other special
    # characters in the scraped text cannot break or alter the statement.
    sql = 'insert into Words (word, phonetic, type_, info) values (%s, %s, %s, %s)'
    rows = list(zip(list1, list2, list3, list4))

    # NOTE(review): connection credentials are hard-coded here and in
    # init_db; consider moving them to configuration.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='python', password='Mysql_123', db='python', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, rows)
        # One commit for the whole batch rather than one per row.
        conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()
    print('保存数据库成功！！')

# Create the data table
def init_db():
    """Create the ``Words`` table if it does not exist yet.

    Using ``if not exists`` lets the script be run again against a database
    that already contains the table instead of failing on startup.
    """
    sql = '''
        create table if not exists Words
        (
        id int PRIMARY KEY AUTO_INCREMENT,
        word char(100) ,
        phonetic char(100),
        type_ char(100),
        info text
        )ENGINE=InnoDB  DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;
    '''  # table schema
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='python', password='Mysql_123', db='python', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
        conn.commit()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()

# Main entry point
def main():
    """Create the database table, then scrape the word pages into it."""
    init_db()
    getData("http://word.iciba.com/?action=words&class=13&course=")



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
